library(DEXSeq)
library(tidyverse)
library(dplyr)
library(msigdbr)
library(fgsea)DEXSeq Isoform Analysis
Load Libraries
source("99_proj_func.R")Ocular Motor Neurons
isoCm_Ocular <- read.csv("data/OcularIsoformCountMatrix.csv", row.names = 1)
Ocular_Metadata <- read.csv("data/OcularSampleMetadata.csv", row.names = 1)
IsoformInfo <- read.csv("data/isoformInfo.csv")OcularResult <- dexseq_analysis(isoCm_Ocular, Ocular_Metadata, IsoformInfo)converting counts to integer mode
Warning in DESeqDataSet(rse, design, ignoreRank = TRUE): some variables in
design formula are characters, converting to factors
Fit for gene/exon ENSG00000234651 threw the next warning(s): the matrix is either rank-deficient or not positive definite
Compare to Differential Expression:
alpha <- 0.05 # threshold
OcularResultGene <- OcularResult |>
as.data.frame() |>
as_tibble() |>
filter(!is.na(pvalue)) |>
select(
gene_id = groupID,
isoform_id = featureID,
stat,
log2FC = log2fold_SOD1_C9ORF72,
pvalue,
padj
) |>
group_by(gene_id) |>
slice_min(
order_by = pvalue,
n = 1,
with_ties = FALSE
) |>
ungroup() |>
mutate(
padj = p.adjust(pvalue, method = 'fdr')
) |>
filter(!is.na(padj))
head(OcularResultGene)# A tibble: 6 × 6
gene_id isoform_id stat log2FC pvalue padj
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ENSG00000000003 ENST00000612152 0.663 -1.38 0.718 0.802
2 ENSG00000000419 ENST00000371582 15.4 -0.441 0.000459 0.00457
3 ENSG00000000457 ENST00000423670 4.38 0.632 0.112 0.262
4 ENSG00000000460 ENST00000459772 2.89 0.462 0.236 0.410
5 ENSG00000000971 ENST00000695981 2.25 2.06 0.324 0.494
6 ENSG00000001036 ENST00000451668 5.16 -0.963 0.0757 0.205
dex_gene <- OcularResultGene |>
mutate(gene_id,
dex_padj = padj,
dex_sig = !is.na(padj) & padj < alpha)
head(dex_gene)# A tibble: 6 × 8
gene_id isoform_id stat log2FC pvalue padj dex_padj dex_sig
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 ENSG00000000003 ENST00000612152 0.663 -1.38 7.18e-1 0.802 0.802 FALSE
2 ENSG00000000419 ENST00000371582 15.4 -0.441 4.59e-4 0.00457 0.00457 TRUE
3 ENSG00000000457 ENST00000423670 4.38 0.632 1.12e-1 0.262 0.262 FALSE
4 ENSG00000000460 ENST00000459772 2.89 0.462 2.36e-1 0.410 0.410 FALSE
5 ENSG00000000971 ENST00000695981 2.25 2.06 3.24e-1 0.494 0.494 FALSE
6 ENSG00000001036 ENST00000451668 5.16 -0.963 7.57e-2 0.205 0.205 FALSE
load("data/Ocular_dseqResult.RData")
OcularDesq <- res_ocular
OcularDesq$padj <- as.numeric(OcularDesq$padj)OcularDesq <- OcularDesq |>
as.data.frame() |>
rownames_to_column("gene_id") |>
mutate(deseq_sig = !is.na(padj) & padj < alpha) |>
filter(!is.na(padj))
head(OcularDesq) gene_id baseMean log2FoldChange lfcSE stat pvalue
1 ENSG00000000003 12926.011840 0.10096750 0.4588043 0.22006659 0.8258193
2 ENSG00000000005 18.828508 -0.61861603 0.8402902 -0.73619336 0.4616130
3 ENSG00000000419 4025.832134 0.21512399 0.2723724 0.78981561 0.4296355
4 ENSG00000000457 1748.634605 0.20570153 0.3076286 0.66866831 0.5037071
5 ENSG00000000460 1272.541630 0.46317296 0.4467637 1.03672916 0.2998621
6 ENSG00000000938 6.719304 -0.05569774 2.1012478 -0.02650698 0.9788530
padj deseq_sig
1 0.9982948 FALSE
2 0.9982948 FALSE
3 0.9982948 FALSE
4 0.9982948 FALSE
5 0.9982948 FALSE
6 0.9982948 FALSE
# Gene universe = genes present in BOTH results
univ <- intersect(dex_gene$gene_id, OcularDesq$gene_id)
both <- dex_gene |>
filter(gene_id %in% univ) |>
mutate(gene_id, dex_sig) |>
inner_join(OcularDesq |>
filter(gene_id %in% univ) |>
mutate(gene_id, deseq_sig),
by = "gene_id")
contig_tab <- table(DEXSeq = both$dex_sig, DESeq2 = both$deseq_sig)
contig_tab DESeq2
DEXSeq FALSE TRUE
FALSE 14243 22
TRUE 2853 527
Fisher Test
ft <- fisher.test(contig_tab, alternative = "greater")
ft
Fisher's Exact Test for Count Data
data: contig_tab
p-value < 2.2e-16
alternative hypothesis: true odds ratio is greater than 1
95 percent confidence interval:
82.62826 Inf
sample estimates:
odds ratio
119.5491
There is very strong overlap between genes detected by DEXSeq and DESeq2.
When DEXSeq identifies a gene with differential isoform usage, that gene is much more likely than random also to show differential expression at the gene level.
Power is different across methods—DEXSeq finds many more significant genes (isoform-level testing is more sensitive), but the ones shared with DESeq2 are highly enriched.
Compare to Differential Expression at Gene-set Level
BP_df = msigdbr(species = "human", category = "C5", subcategory = "BP")
BP_list = split(BP_df$ensembl_gene, BP_df$gs_name)deseq_names <- sub("\\.\\d+$", "", OcularDesq$gene_id)
deseq_vals <- OcularDesq$stat
stats_deseq <- setNames(deseq_vals, deseq_names)
stats_deseq <- stats_deseq[!is.na(stats_deseq)]
stats_deseq <- sort(stats_deseq, decreasing = TRUE)
dex_names <- sub("\\.\\d+$", "", dex_gene$gene_id)
dex_vals <- dex_gene$stat
stats_dex <- setNames(dex_vals, dex_names)
stats_dex <- stats_dex[!is.na(stats_dex)]
stats_dex <- sort(stats_dex, decreasing = TRUE)fg_deseq <- fgseaMultilevel(pathways = BP_list, stats = stats_deseq,
minSize = 15, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (2.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
dim(fg_deseq)[1] 4244 8
fg_dex <- fgseaMultilevel(pathways = BP_list, stats = stats_dex,
minSize = 15, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.92% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
Warning in fgseaMultilevel(pathways = BP_list, stats = stats_dex, minSize = 15,
: There were 8 pathways for which P-values were not calculated properly due to
unbalanced (positive and negative) gene-level statistic values. For such
pathways pval, padj, NES, log2err are set to NA. You can try to increase the
value of the argument nPermSimple (for example set it nPermSimple = 10000)
Warning in fgseaMultilevel(pathways = BP_list, stats = stats_dex, minSize = 15,
: For some of the pathways the P-values were likely overestimated. For such
pathways log2err is set to NA.
fg_dex_ocular <- fg_dex
dim(fg_dex)[1] 3734 8
sig_deseq <- fg_deseq |> filter(padj < alpha) |> arrange(padj)
sig_dex <- fg_dex |> filter(padj < alpha) |> arrange(padj)
n_sig_de <- nrow(sig_deseq)
n_sig_dx <- nrow(sig_dex)
# shared pathways (by name)
shared <- intersect(sig_deseq$pathway, sig_dex$pathway)
n_shared <- length(shared)
n_sig_de[1] 1075
n_sig_dx[1] 5
n_shared[1] 2
shared[1] "GOBP_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
[2] "GOBP_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
de_df <- fg_deseq |> transmute(pathway, NES_de = NES, padj_de = padj)
dx_df <- fg_dex |> transmute(pathway, NES_dx = NES, padj_dx = padj)
# Merge on pathway (shared sets only)
cmp <- inner_join(de_df, dx_df, by = "pathway") %>%
mutate(sig_cat = case_when(
padj_de < alpha & padj_dx < alpha ~ "Both",
padj_de < alpha & padj_dx >= alpha ~ "DESeq2 only",
padj_de >= alpha & padj_dx < alpha ~ "DEXSeq only",
TRUE ~ "None"
)) %>%
mutate(sig_cat = factor(sig_cat, levels = c("Both","DESeq2 only","DEXSeq only","None")))
cmp pathway NES_de
<char> <num>
1: GOBP_2_OXOGLUTARATE_METABOLIC_PROCESS 1.6815239
2: GOBP_3_PHOSPHOADENOSINE_5_PHOSPHOSULFATE_METABOLIC_PROCESS -1.2756852
3: GOBP_3_UTR_MEDIATED_MRNA_STABILIZATION 1.3086038
4: GOBP_ACETYL_COA_BIOSYNTHETIC_PROCESS 1.7231721
5: GOBP_ACETYL_COA_METABOLIC_PROCESS 1.7733249
---
3655: GOBP_XENOBIOTIC_METABOLIC_PROCESS 1.2992393
3656: GOBP_XENOBIOTIC_TRANSPORT 0.6645772
3657: GOBP_ZINC_ION_HOMEOSTASIS -1.3121420
3658: GOBP_ZINC_ION_TRANSPORT -1.3287925
3659: GOBP_ZYMOGEN_ACTIVATION 0.8576990
padj_de NES_dx padj_dx sig_cat
<num> <num> <num> <fctr>
1: 0.05686885 0.9598756 1.0000000 None
2: 0.33066785 1.0705436 1.0000000 None
3: 0.28263335 0.9706749 1.0000000 None
4: 0.05435541 1.1690774 1.0000000 None
5: 0.03164361 1.0416604 1.0000000 DESeq2 only
---
3655: 0.20220078 0.9585218 1.0000000 None
3656: 0.95565365 1.1842482 1.0000000 None
3657: 0.24151258 1.5104315 0.5993666 None
3658: 0.25098715 1.5625600 0.3884447 None
3659: 0.79088753 1.0223987 1.0000000 None
# Scatter + smooth
ggplot(cmp, aes(x = NES_de, y = NES_dx, color = sig_cat)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = "NES (DESeq2 fgsea)",
y = "NES (DEXSeq fgsea)",
color = "Significance",
title = "NES comparison: DESeq2 vs DEXSeq (fgseaMultilevel)"
) +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 8 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).
save(OcularResult, file="data/Ocular_dxdResult.RData")Spinal Motor Neurons
Load Data
isoCm_Spinal <- read.csv("data/SpinalIsoformCountMatrix.csv", row.names = 1)
Spinal_Metadata <- read.csv("data/SpinalSampleMetadata.csv", row.names = 1)
IsoformInfo <- read.csv("data/isoformInfo.csv")SpinalResult <- dexseq_analysis(isoCm_Spinal, Spinal_Metadata, IsoformInfo)converting counts to integer mode
Warning in DESeqDataSet(rse, design, ignoreRank = TRUE): some variables in
design formula are characters, converting to factors
Compare to Differential Expression:
alpha <- 0.05 # example threshold
SpinalResultGene <- SpinalResult |>
as.data.frame() |>
as_tibble() |>
filter(!is.na(pvalue)) |>
select(
gene_id = groupID,
isoform_id = featureID,
stat,
log2FC = log2fold_SOD1_C9ORF72,
pvalue,
padj
) |>
group_by(gene_id) |>
slice_min(
order_by = pvalue,
n = 1,
with_ties = FALSE
) |>
ungroup() |>
mutate(
padj = p.adjust(pvalue, method = 'fdr')
) |>
filter(!is.na(padj))dex_gene <- SpinalResultGene |>
mutate(gene_id,
dex_padj = padj,
dex_sig = !is.na(padj) & padj < alpha)load("data/Spinal_dseqResult.RData")
SpinalDesq <- res_spinal
SpinalDesq$padj <- as.numeric(SpinalDesq$padj)SpinalDesq <- SpinalDesq |>
as.data.frame() |>
rownames_to_column("gene_id") |>
mutate(deseq_sig = !is.na(padj) & padj < alpha) |>
filter(!is.na(padj))# Gene universe = genes present in BOTH results
univ <- intersect(dex_gene$gene_id, SpinalDesq$gene_id)
both <- dex_gene %>%
filter(gene_id %in% univ) %>%
mutate(gene_id, dex_sig) %>%
inner_join(SpinalDesq %>%
filter(gene_id %in% univ) %>%
mutate(gene_id, deseq_sig), by = "gene_id")
contig_tab <- table(DEXSeq = both$dex_sig, DESeq2 = both$deseq_sig)
contig_tab DESeq2
DEXSeq FALSE TRUE
FALSE 9609 151
TRUE 5717 2569
Fisher Test
ft <- fisher.test(contig_tab, alternative = "greater")
ft
Fisher's Exact Test for Count Data
data: contig_tab
p-value < 2.2e-16
alternative hypothesis: true odds ratio is greater than 1
95 percent confidence interval:
24.80359 Inf
sample estimates:
odds ratio
28.59148
Compare to Differential Expression at Gene-set Level
BP_df = msigdbr(species = "human", category = "C5", subcategory = "BP")
BP_list = split(BP_df$ensembl_gene, BP_df$gs_name)deseq_names <- sub("\\.\\d+$", "", SpinalDesq$gene_id)
deseq_vals <- SpinalDesq$stat
stats_deseq <- setNames(deseq_vals, deseq_names)
stats_deseq <- stats_deseq[!is.na(stats_deseq)]
stats_deseq <- sort(stats_deseq, decreasing = TRUE)
dex_names <- sub("\\.\\d+$", "", dex_gene$gene_id)
dex_vals <- dex_gene$stat
stats_dex <- setNames(dex_vals, dex_names)
stats_dex <- stats_dex[!is.na(stats_dex)]
stats_dex <- sort(stats_dex, decreasing = TRUE)fg_deseq <- fgseaMultilevel(pathways = BP_list, stats = stats_deseq,
minSize = 15, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (2.9% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
fg_dex <- fgseaMultilevel(pathways = BP_list, stats = stats_dex,
minSize = 15, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (2.02% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
Warning in fgseaMultilevel(pathways = BP_list, stats = stats_dex, minSize = 15,
: There were 5 pathways for which P-values were not calculated properly due to
unbalanced (positive and negative) gene-level statistic values. For such
pathways pval, padj, NES, log2err are set to NA. You can try to increase the
value of the argument nPermSimple (for example set it nPermSimple = 10000)
fg_dex_spinal <- fg_dex
sig_deseq <- fg_deseq %>% filter(padj < alpha) %>% arrange(padj)
sig_dex <- fg_dex %>% filter(padj < alpha) %>% arrange(padj)
n_sig_de <- nrow(sig_deseq)
n_sig_dx <- nrow(sig_dex)
# shared pathways (by name)
shared <- intersect(sig_deseq$pathway, sig_dex$pathway)
n_shared <- length(shared)
n_sig_de[1] 635
n_sig_dx[1] 2
n_shared[1] 2
shared[1] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION"
[2] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN"
de_df <- fg_deseq %>% transmute(pathway, NES_de = NES, padj_de = padj)
dx_df <- fg_dex %>% transmute(pathway, NES_dx = NES, padj_dx = padj)
# Merge on pathway (shared sets only)
cmp <- inner_join(de_df, dx_df, by = "pathway") %>%
mutate(sig_cat = case_when(
padj_de < alpha & padj_dx < alpha ~ "Both",
padj_de < alpha & padj_dx >= alpha ~ "DESeq2 only",
padj_de >= alpha & padj_dx < alpha ~ "DEXSeq only",
TRUE ~ "None"
)) %>%
mutate(sig_cat = factor(sig_cat, levels = c("Both","DESeq2 only","DEXSeq only","None")))# Scatter + smooth
ggplot(cmp, aes(x = NES_de, y = NES_dx, color = sig_cat)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = "NES (DESeq2 fgsea)",
y = "NES (DEXSeq fgsea)",
color = "Significance",
title = "NES comparison: DESeq2 vs DEXSeq (fgseaMultilevel)"
) +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_point()`).
save(SpinalResult, file="data/Spinal_dxdResult.RData")iso_id_padj_value_ocular = data.frame("transcript_id" = OcularResult$featureID,
"padj" = OcularResult$padj)
iso_id_padj_value_spinal = data.frame("transcript_id" = SpinalResult$featureID,
"padj" = SpinalResult$padj)
write.csv(iso_id_padj_value_ocular, "data/iso_id_padj_value_ocular.csv")
write.csv(iso_id_padj_value_spinal, "data/iso_id_padj_value_spinal.csv")Investigation of Significant Pathways
Compare Pathways
head(OcularResultGene)# A tibble: 6 × 6
gene_id isoform_id stat log2FC pvalue padj
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ENSG00000000003 ENST00000612152 0.663 -1.38 0.718 0.802
2 ENSG00000000419 ENST00000371582 15.4 -0.441 0.000459 0.00457
3 ENSG00000000457 ENST00000423670 4.38 0.632 0.112 0.262
4 ENSG00000000460 ENST00000459772 2.89 0.462 0.236 0.410
5 ENSG00000000971 ENST00000695981 2.25 2.06 0.324 0.494
6 ENSG00000001036 ENST00000451668 5.16 -0.963 0.0757 0.205
head(SpinalResultGene)# A tibble: 6 × 6
gene_id isoform_id stat log2FC pvalue padj
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ENSG00000000003 ENST00000496771 1.54 -0.221 4.63e- 1 5.43e- 1
2 ENSG00000000419 ENST00000371584 55.1 -1.42 1.10e-12 1.77e-11
3 ENSG00000000457 ENST00000470238 2.15 0.346 3.41e- 1 4.28e- 1
4 ENSG00000000460 ENST00000413811 2.43 0.235 2.97e- 1 3.82e- 1
5 ENSG00000000971 ENST00000695986 33.2 15.0 6.05e- 8 5.45e- 7
6 ENSG00000001036 ENST00000367585 17.2 -0.705 1.81e- 4 8.18e- 4
fg_dex_spinal <- fg_dex_spinal |> mutate(sig_spinal = !is.na(padj) & padj < alpha)
fg_dex_ocular <- fg_dex_ocular |> mutate(sig_ocular = !is.na(padj) & padj < alpha)both <- fg_dex_spinal |>
left_join(fg_dex_ocular, by="pathway") |>
select(pathway, sig_spinal, sig_ocular) |>
mutate(sig = case_when(
sig_spinal & sig_ocular ~ "Both",
sig_spinal ~ "Spinal only",
sig_ocular ~ "Ocular only",
TRUE ~ "None"
)) both |> group_by(sig) |> count()# A tibble: 3 × 2
# Groups: sig [3]
sig n
<chr> <int>
1 None 3738
2 Ocular only 5
3 Spinal only 2
Pathways enriched in _ compared to _ [Spinal]
both |> filter(sig_spinal) |> pull(pathway)[1] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION"
[2] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN"
Pathways enriched in _ compared to _ [Ocular]
both |> filter(sig_ocular) |> pull(pathway)[1] "GOBP_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
[2] "GOBP_ARGININE_METABOLIC_PROCESS"
[3] "GOBP_GLUTAMINE_FAMILY_AMINO_ACID_CATABOLIC_PROCESS"
[4] "GOBP_NITRIC_OXIDE_MEDIATED_SIGNAL_TRANSDUCTION"
[5] "GOBP_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
Pathways enriched only in Ocular Motor Neurons
both |> filter(sig=='Ocular only') |> pull(pathway)[1] "GOBP_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
[2] "GOBP_ARGININE_METABOLIC_PROCESS"
[3] "GOBP_GLUTAMINE_FAMILY_AMINO_ACID_CATABOLIC_PROCESS"
[4] "GOBP_NITRIC_OXIDE_MEDIATED_SIGNAL_TRANSDUCTION"
[5] "GOBP_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
Pathways enriched only in Spinal Motor Neurons
both |> filter(sig=='Spinal only') |> pull(pathway)[1] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION"
[2] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN"
Pathways enriched in Both Motor Neurons
both |> filter(sig=='Both') |> pull(pathway)character(0)